Imports
In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score, auc, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
In [2]:
# Load the diabetes dataset from the project data directory; the first CSV
# column is used as the row index rather than as a feature.
df = pd.read_csv(
    '../data/pima-indians-diabetes-data.csv',
    index_col=[0],
)
In [3]:
# Preview the first five rows to sanity-check that the file parsed as expected.
df.head(n=5)
Out[3]:
In [4]:
# Summary statistics for the loaded frame (DataFrame.describe with its defaults),
# displayed to eyeball value ranges before modelling.
df.describe()
Out[4]:
Look at class distribution
In [5]:
# Row counts per label, reported as (positives, negatives). Summing the boolean
# mask counts matching rows without materialising a filtered copy of the frame.
tuple(int((df['class'] == label).sum()) for label in (1, 0))
Out[5]:
In [6]:
# Target vector: the 'class' column as a NumPy array.
y = df['class'].values
# Feature matrix: every column except 'class', as a NumPy array.
X = df.drop(columns='class').values
In [7]:
# Hold out a test set (scikit-learn's default test fraction is used).
# random_state pins the shuffle so every metric below is reproducible under
# Restart & Run All; without it the split, and therefore the reported AUC and
# recall values, change on every execution.
# stratify=y keeps the class ratio the same in the train and test splits, which
# matters here because per-class recall is compared further down.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)
Train the model
In [8]:
# Random forest with 100 trees. random_state fixes the forest's internal
# randomness (bootstrap sampling and per-split feature selection) so repeated
# runs fit the same model and yield the same metrics.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Left as the last expression so the fitted estimator is displayed as cell output.
clf.fit(X_train, y_train)
Out[8]:
In [9]:
# Class-membership probabilities for the held-out samples
# (one column per class, in the order given by clf.classes_).
y_pred_proba = clf.predict_proba(X_test)
# Hard class labels for the same samples.
y_pred = clf.predict(X_test)
In [10]:
# ROC AUC is computed from scores rather than hard labels: take column 1 of the
# probability matrix (assumed to be the positive class, 1 — confirm via clf.classes_).
positive_scores = y_pred_proba[:, 1]
print("AUC: %.3f" % roc_auc_score(y_test, positive_scores))
Calculate confusion matrix
| | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | TP | FN |
| Actual Negative | FP | TN |
In [11]:
# labels=[1, 0] orders both rows (actual) and columns (predicted) positive-first,
# so the result reads [[TP, FN], [FP, TN]].
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])
Out[11]:
In [12]:
# Sensitivity (true-positive rate): recall with class 1 treated as positive.
# Author's reading of the displayed value: low-moderate sensitivity.
recall_score(y_true=y_test, y_pred=y_pred, pos_label=1)
Out[12]:
In [13]:
# Specificity (true-negative rate): recall with class 0 treated as positive.
# Author's reading of the displayed value: high specificity.
recall_score(y_true=y_test, y_pred=y_pred, pos_label=0)
Out[13]:
In [ ]: